We are going to download data from the III Forest Inventory (Murcia)
##load data
FI_Murcia <- read.csv("~/Dropbox/Rcourse/day1/FI_Murcia.csv")
attach(FI_Murcia)
names(FI_Murcia)
## [1] "Species" "Shape" "Height" "Diameter1" "Diameter2"
##specify dataset and mapping
library(ggplot2)
ggplot(data = FI_Murcia,
mapping = aes(x = Diameter1, y = Height))
Map variables
We need to specify what we wanted placed on the graph
##add geom_point
library(ggplot2)
ggplot(data = FI_Murcia,
mapping = aes(x = Diameter1, y = Height))+
geom_point()
We can change the color with color
##make points blue, larger, and semi-transparent
ggplot(data = FI_Murcia,
mapping = aes(x = Diameter1, y = Height)) +
geom_point(color = "cornflowerblue", alpha = .7,
size = 3)
And also we can add a trend line (in this case it’s not really informative :( )
##add a line of best fit
ggplot(data = FI_Murcia,
mapping = aes(x = Diameter1, y = Height)) +
geom_point(color = "cornflowerblue", alpha = .7, size = 3) +
geom_smooth(method = "lm")
Also, we can differentiate different levels to plot them separately
##indicate species using color
FI_Murcia_sub = subset(FI_Murcia, Species == c("Castanea sativa", "Fagus sylvatica", "Pinus nigra", "Quercus robur"))
ggplot(data = FI_Murcia_sub,
mapping = aes(x = Diameter1, y = Height, color = Species)) +
geom_point(alpha = .7, size = 3) +
geom_smooth(method = "lm", se = FALSE, size = 1.5)
And also, we can plot different charts for each species
# reproduce plot for each species
ggplot(data = FI_Murcia_sub,
mapping = aes(x = Diameter1, y = Height)) +
geom_point(alpha = .7) +
geom_smooth(method = "lm", se = FALSE) +
scale_x_continuous(breaks = seq(0, 2, 0.5)) +
scale_y_continuous(breaks = seq(0, 60, 10)) +
scale_color_manual(values = c("indianred3","cornflowerblue")) +
facet_wrap(~Species)
Plot the distribution of species
library(ggplot2)
ggplot(FI_Murcia_sub, aes(x = Species)) +
geom_bar()
Plot the distribution of species with modified colors and labels
ggplot(FI_Murcia_sub, aes(x = Species)) +
geom_bar(fill = "cornflowerblue", color="black") +
labs(x = "Species", y = "Frequency",
title = "Individuals by species")
Plot the distribution as percentages
ggplot(FI_Murcia_sub, aes(x = Species, y = ..count.. / sum(..count..))) +
geom_bar() + labs(x = "Species", y = "Percent",
title = "Individuals by species") +
scale_y_continuous(labels = scales::percent)
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Calculate number of participants in each species category
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
plotdata <- FI_Murcia_sub %>%
count(Species)
plotdata
## Species n
## 1 Castanea sativa 27
## 2 Fagus sylvatica 1441
## 3 Pinus nigra 130
## 4 Quercus robur 197
Plot the bars in ascending order
# plot the bars in ascending order
ggplot(plotdata, aes(x = reorder(Species, n), y = n)) +
geom_bar(stat = "identity") +
labs(x = "Species", y = "Frequency",
title = "Individuals by species")
Plot the bars in descending order
ggplot(plotdata, aes(x = reorder(Species, -n), y = n)) +
geom_bar(stat = "identity") +
labs(x = "Species", y = "Frequency",
title = "Individuals by species")
plot the bars with numeric labels
geom_text adds the labels, vjust controls
vertical justification
ggplot(plotdata, aes(x = reorder(Species, n), y = n)) +
geom_bar(stat = "identity") +
geom_text(aes(label = n), vjust=-0.5) +
labs(x = "Species", y = "Frequency",
title = "Individuals by species")
plotdata <- FI_Murcia_sub %>% count(Species) %>% mutate(pct = n / sum(n),
pctlabel = paste0(round(pct*100), "%"))
Plot the bars as percentages, in descending order with bar labels
ggplot(plotdata, aes(x = reorder(Species, -pct), y = pct)) +
geom_bar(stat = "identity", fill = "indianred3", color = "black") +
geom_text(aes(label = pctlabel), vjust = -0.25) +
scale_y_continuous(labels = scales::percent) +
labs(x = "Species", y = "Percent", title = "Individuals by species")
Horizontal bar chart
ggplot(FI_Murcia_sub, aes(x = Species)) +
geom_bar() +
labs(x = "", y = "Frequency", title = "Individuals by shape") + coord_flip()
Plot the age distribution using a histogram
ggplot(FI_Murcia_sub, aes(x = Height)) + geom_histogram() +
labs(title = "Individuals by Height", x = "Height")
Plot the histogram with blue bars and white borders
ggplot(FI_Murcia_sub, aes(x = Height)) +
geom_histogram(fill = "cornflowerblue", color = "white") +
labs(title = "Individuals by Height", x = "Height")
Plot the histogram with 20 bins
ggplot(FI_Murcia_sub, aes(x = Height)) +
geom_histogram(fill = "cornflowerblue", color = "white", bins = 20) +
labs(title="Individuals by Height", subtitle = "number of bins = 20", x = "Height")
Plot the histogram with a binwidth of 5
ggplot(FI_Murcia_sub, aes(x = Height)) +
geom_histogram(fill = "cornflowerblue", color = "white", binwidth = 5) +
labs(title="Individuals by Height", subtitle = "binwidth = 5 meters", x = "Height")
Plot the histogram with percentages on the y-axis
library(scales)
ggplot(FI_Murcia_sub, aes(x = Height, y= ..count.. / sum(..count..))) +
geom_histogram(fill = "cornflowerblue", color = "white", binwidth = 5) +
labs(title="Individuals by Height", y = "Percent", x = "Height") +
scale_y_continuous(labels = percent)
There are more things to add some “fantasy” to your plots as:
Stacked bar chart
FI_Murcia_sub$Shape<- as.factor(FI_Murcia_sub$Shape)
ggplot(FI_Murcia_sub,
aes(x = Species,
fill = Shape)) + geom_bar(position = "stack")
From the chart, we can see for example, that the most common species is F. sylvatica. An the most common shape for all the species is 2
Stacked is the default, so the last line could have also been written
as geom_bar().
Grouped bar chart
ggplot(FI_Murcia_sub,
aes(x = Species,
fill = Shape)) + geom_bar(position = "dodge")
By default, zero count bars are dropped and the remaining bars are
made wider. This may not be the behavior you want. You can modify this
using the position = position_dodge(preserve = "single")
option.
ggplot(FI_Murcia_sub,
aes(x = Species,
fill = Shape)) +
geom_bar(position = position_dodge(preserve="single"))
Segmented bar chart
FI_Murcia_sub$Shape<- as.factor(FI_Murcia_sub$Shape)
ggplot(FI_Murcia_sub,
aes(x = Species,
fill = Shape)) +
geom_bar(position = "fill") + labs(y="Proportion")
This type of plot is particularly useful if the goal is to compare the percentage of a category in one variable across each level of another variable.
The simplest display of two quantitative variables is a scatterplot, with each variable represented on an axis.
ggplot(FI_Murcia_sub,
aes(x = Diameter1, y = Height)) +
geom_point()
Scatterplot with linear fit line
ggplot(FI_Murcia_sub,
aes(x = Diameter1, y = Height)) +
geom_point(color="steelblue") +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
Clearly, Height increases with Diameter. However a straight line does not capture this non-linear effect. A polynomial regression line will fit better here.
Scatterplot with quadratic line of best fit
ggplot(FI_Murcia_sub,
aes(x = Diameter1, y = Height)) +
geom_point(color="steelblue") +
geom_smooth(method = "lm", formula = y ~ poly(x, 2),
color = "indianred3")
Finally, a smoothed nonparametric fit line can often provide a good
picture of the relationship. The default in
ggplot2 is a loess line which stands for
for locally weighted scatterplot smoothing.
ggplot(FI_Murcia_sub,
aes(x = Diameter1, y = Height)) +
geom_point(color = "steelblue") +
geom_smooth(color = "tomato")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
When one of the two variables represents time, a line plot can be an effective method of displaying relationship.
library(curl)
## Using libcurl 8.4.0 with LibreSSL/3.3.6
rats <- read.csv("~/Dropbox/Rcourse/day1/Rats.csv")
ggplot(rats,
aes(x = time, y = captures)) +
geom_line()
Line plot with points and improved labeling
ggplot(rats,
aes(x = time, y = captures)) +
geom_line(size = 1.5, color = "lightgrey") + geom_point(size = 3, color = "steelblue") +
labs(y = "Individuals captured", x = "Ocasion",
title = "Removal method data", subtitle = "An Attempt to Determine the Absolute Number of Rats on a Given Area",
caption = "Leslie and Davies (1939)")
When plotting the relationship between a categorical variable and a quantitative variable, a large number of graph types are available.
These include bar charts using summary statistics, grouped kernel density plots, side-by-side box plots, side-by-side violin plots, and Cleveland plots.
# calculate mean height for each species
plotdata <- FI_Murcia_sub %>%
group_by(Species) %>%
summarize(mean_Height = mean(Height))
# plot mean height
ggplot(plotdata,
aes(x = Species, y = mean_Height)) +
geom_bar(stat = "identity")
Plot mean heights in a more attractive fashion
library(scales)
ggplot(plotdata,
aes(x = factor(Species,
labels = c("Castanea\nsativa", "Fagus\nsylvatica", "Pinus\nnigra", "Quercus\nrobur")), y = mean_Height)) +
geom_bar(stat = "identity", fill = "cornflowerblue") +
geom_text(aes(label = round(mean_Height, 2)),
vjust = -0.2) +
labs(title = "Mean Height by tree species",
subtitle = "National Forest Inventory (Murcia)", x = "",
y = "m")
One can compare groups on a numeric variable by superimposing kernel density plots in a single graph.
By rank using kernel density plots
ggplot(FI_Murcia_sub,
aes(x = Height, fill = Species)) +
geom_density(alpha = 0.4) +
labs(title = "Height distribution by species")
alpha values range from 0 (transparent) to 1
(opaque)
Plot the distribution of height by species using boxplot
ggplot(FI_Murcia_sub,
aes(x = Species, y = Height)) +
geom_boxplot() +
labs(title = "Height distribution by species")
Violin plots are similar to kernel density plots, but are mirrored and rotated 90\(^\circ\) .
ggplot(FI_Murcia_sub,
aes(x = Species, y = Height)) +
geom_violin() +
labs(title = "Height distribution by species")
A useful variation is to superimpose boxplot on violin plots.
ggplot(FI_Murcia_sub,
aes(x = Species, y = Height)) +
geom_violin(fill = "cornflowerblue") +
geom_boxplot(width = .2, fill = "orange", outlier.color = "orange", outlier.size = 2) +
labs(title = "Height distribution by species")
Cleveland plots are useful when you want to compare a numeric statistic for a large number of groups.
plotdata<-FI_Murcia %>%
group_by(Species) %>%
mutate(Mean = mean(Height, na.rm=TRUE))
ggplot(plotdata,
aes(x = Mean, y = Species)) +
geom_point()
ggplot(plotdata,
aes(x = Mean, y = reorder(Species,Mean))) +
geom_point()
In grouping, the values of the first two variables are mapped to the x and y axes. Then additional variables are mapped to other visual characteristics such as color, shape, size, line type, and transparency. Grouping allows you to plot the data for multiple groups in a single graph.
library(ggplot2)
ggplot(Fires,
aes(x = speed..km.day.1., y=Size..km2.)) +
geom_point() +
labs(title = "Size (km2) by speed (km/day)")
Next, let’s include the Regions, using color.
ggplot(Fires,
aes(x = speed..km.day.1., y=Size..km2.,
color = dominant.spread.direction)) + geom_point() +
labs(title = "Size (km2) by speed (km/day) and region")
Finally, let’s add the shape of the trees, using the shape of the points to indicate shape. We’ll increase the point size and add transparency to make the individual points clearer.
ggplot(Fires,
aes(x = speed..km.day.1., y=Size..km2.,
color = dominant.spread.direction, shape = Region)) + geom_point() +
labs(title = "Size (km2) by speed (km/day), region and dominant spread direction")
ggplot(Fires,
aes(x = speed..km.day.1., y=Size..km2., color = Region)) +
geom_point(alpha = .4, size = 3) +
geom_smooth(se=FALSE, method = "lm", formula = y~poly(x,2), size = 1.5) +
labs(x = "Speed (km/day)", y = "Size (km2)", title = "Size (km2) by speed (km/day) and region",
subtitle = "Summary of Large Forest Fires", y = "", color = "Regions") +
scale_color_brewer(palette = "Set1") + theme_minimal()
Grouping allows you to plot multiple variables in a single graph, using visual characteristics such as color, shape, and size.
In faceting, a graph consists of several separate plots or small multiples, one for each level of a third variable, or combination of variables. It is easiest to understand this with an example.
ggplot(Fires, aes(x = Size..km2.)) + geom_histogram(fill = "cornflowerblue",
color = "white") + facet_wrap(~Region, ncol = 1) +
labs(title = "Size (km2) by species")
The facet_wrap function creates a separate graph for
each species. The ncol option controls the
number of columns.
ggplot(Fires, aes(x = Size..km2.)) +
geom_histogram(fill = "cornflowerblue", color = "white") +
facet_wrap(Region ~ dominant.spread.direction) +
labs(title = "Size histograms by Region")
The format of the facet_grid function is
facet_grid( row variable(s) ~ column variable(s))
# plot total area by year, for each country
ggplot(plotdata, aes(x=year, y = total)) + geom_line(color="grey") +
geom_point(color="blue") + facet_wrap(~Region) + theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Changes in burned area",x = "Year",y = "Total area")
library(rworldmap)
# get map
MyMap <- ggplot() + borders("world", colour="black", fill="grey") + theme_bw()
MyMap
MyMap +
geom_point(data = Fires, aes(x=lon, y=lat),size = 2, color = "red")+
theme(panel.grid.major = element_line(color = "gray60", linetype = "dashed", size = 0.25),
panel.background = element_rect(fill = "aliceblue"))
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(viridis)
MyMap +
geom_point(data = Fires, aes(x=lon, y=lat, color = Size..km2.),size = 2) +
scale_color_viridis() +
theme(panel.grid.major = element_line(color = "gray60", linetype = "dashed", size = 0.25),
panel.background = element_rect(fill = "aliceblue"))
library(viridis)
MyMap + coord_fixed(xlim=c(-10, 37.5), ylim=c(30, 48)) +
geom_point(data = Fires, aes(x=lon, y=lat, color = Size..km2.),size = 2) +
scale_color_viridis() +
theme(panel.grid.major = element_line(color = "gray60", linetype = "dashed", size = 0.25),
panel.background = element_rect(fill = "aliceblue"))
MyMap + coord_fixed(xlim=c(-10, 37.5), ylim=c(30, 48)) +
geom_point(data = Fires, aes(x=lon, y=lat, color = Size..km2.),size = 2) +
scale_color_viridis() +
facet_wrap(~year) +
theme(panel.grid.major = element_line(color = "gray60", linetype = "dashed", size = 0.25),
panel.background = element_rect(fill = "aliceblue"))
ggplot(pm10subset, aes(x = day, y = pm10)) +
geom_line() +
labs(title = "Levels of pm10 during 2014", x = "Day of the year", y = "pm10 concentration")
library(scales)
ggplot(pm10subset, aes(x = day, y = pm10)) +
geom_line(color = "indianred3", size=1 ) +
geom_smooth() + labs(title = "Pm10 levels", subtitle = "2014", x = "DOY", y = "pm10 concentration") +
theme_minimal()
# multivariate time series
attach(pm10)
ggplot(pm10,
aes(x=day , y= pm10, color=site)) + geom_line(size=1) +
labs(title = "pm10 levels London",
subtitle = "2014", caption = "source: OpenAir", y = "pm10 concentration") +
theme_minimal() + scale_color_brewer(palette = "Dark2")
ggplot(pm10subset, aes(x = day, y = pm10)) +
geom_area(fill="lightblue", color="black") +
labs(title = "pm10 levels", x = "day", y = "pm10 concentration")
A stacked area chart can be used to show differences between groups over time.
ggplot(pm10, aes(x = day, y = pm10, fill= site)) +
geom_area() +
labs(title = "pm10 levels", x = "day", y = "pm10 concentration")
The levels of the site variable can be reversed using the fct_rev function in the forcats package.
df <- dplyr::select_if(FI_Murcia, is.numeric)
r <- cor(df, use="complete.obs")
round(r,2)
## Shape Height Diameter1 Diameter2
## Shape 1.00 -0.52 -0.22 -0.22
## Height -0.52 1.00 0.67 0.67
## Diameter1 -0.22 0.67 1.00 0.99
## Diameter2 -0.22 0.67 0.99 1.00
library(ggcorrplot)
ggcorrplot(r)
By default, zero count bars are dropped and the remaining bars are
made wider. This may not be the behavior you want. You can modify this
using the position = position_dodge(preserve = "single")
option.
ggcorrplot(r,
hc.order = TRUE,
type = "lower", lab = TRUE)
Height_lm <- lm(Height ~ Diameter1 + Diameter2, data = FI_Murcia)
library(visreg)
#The visreg function takes (1) the model and (2) the variable of interest and plots
#the conditional relationship, controlling for the other variables.
#The option gg = TRUE is used to produce a ggplot2 graph.
visreg(Height_lm, "Diameter1", gg = TRUE) +
labs(title = "Relationship between Height and Diameter1",
caption = "source: NFI Murcia",
y = "Height (m)",
x = "Diameter (m)")
visreg(Chrod_glm, "altitude",
gg = TRUE,
scale="response") + labs(y = "Prob(Presence)", x = "Altitude",
title = "Relationship of Age and Presence")